import pandas as pd
import numpy as np
import re
from pathlib import Path
import plotly.express as px
pd.options.plotting.backend = 'plotly'
from lec_utils import *
# from save_data import *
Step 1: Introduction¶
# Location of the raw ICPSR study folders (one folder per survey year).
base_path = Path("/Users/yipho/eecs398/portfolio/allyears")
# Destination for the merged per-year CSV extracts.
output_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
output_path.mkdir(parents=True, exist_ok=True)
# Survey years 2000 through 2023, inclusive.
years_to_process = range(2000, 2024)
def rename_case_id_to_respondent_id(df):
    """Normalize the respondent key: rename CASEID to RESPONDENT_ID.

    Mutates ``df`` in place (only when CASEID exists and RESPONDENT_ID
    does not, so an existing RESPONDENT_ID column is never clobbered)
    and returns the same frame for chaining.
    """
    needs_rename = ("CASEID" in df.columns
                    and "RESPONDENT_ID" not in df.columns)
    if needs_rename:
        df.rename(columns={"CASEID": "RESPONDENT_ID"}, inplace=True)
    return df
def load_and_save_icpsr_data(base_path, output_path, years_to_process):
    """Merge Form 1 and Form 6 Stata files for each year and save one CSV per year.

    For every year, finds matching ICPSR_* folders under ``base_path``, loads the
    DS0001 and DS0006 .dta files, inner-joins them on RESPONDENT_ID, tags rows
    with the year, and writes the concatenated result to
    ``output_path / ICPSR_data_<year>.csv``. Missing folders/files are logged
    and skipped rather than raising.
    """
    for year in years_to_process:
        print(f"Processing year {year}...")
        # Folder names end with the survey year, e.g. "ICPSR_12345-2007".
        icpsr_folders = list(base_path.glob(f"ICPSR_*{year}"))
        print(icpsr_folders)
        if not icpsr_folders:
            print(f"No ICPSR folder found for year {year}. Skipping...")
            continue
        merged_data= []
        for folder in icpsr_folders:
            # Study number is the token between "ICPSR_" and the first "-".
            study_number = folder.name.split("_")[1].split("-")[0]
            form1_path = folder / f"DS0001/{study_number}-0001-Data.dta"
            form6_path = folder / f"DS0006/{study_number}-0006-Data.dta"
            # Load and merge Form 1 and Form 6 if both exist
            if form1_path.exists() and form6_path.exists():
                # print(f"Found Form 1 and Form 6 data for year {year}.")
                try:
                    # Load Form 1
                    df1 = pd.read_stata(form1_path)
                    print(f"Loaded Form 1 with shape: {df1.shape}")
                    df1 = rename_case_id_to_respondent_id(df1)
                    # Load Form 6
                    df6 = pd.read_stata(form6_path)
                    print(f"Loaded Form 6 with shape: {df6.shape}")
                    df6 = rename_case_id_to_respondent_id(df6)
                    # Inner join keeps only respondents present in BOTH forms.
                    if "RESPONDENT_ID" in df1.columns and "RESPONDENT_ID" in df6.columns:
                        df_merged = df1.merge(df6, on="RESPONDENT_ID", how="inner")
                        print(f"Merged data shape: {df_merged.shape}")
                        df_merged["Year"] = year
                        merged_data.append(df_merged)
                    else:
                        print(f"'RESPONDENT_ID' column missing in Form 1 or Form 6 for year {year}. Skipping merge.")
                except Exception as e:
                    # Best-effort per-folder: log and continue with the next folder.
                    print(f"Error processing Form 1 and Form 6 for {year}: {e}")
            else:
                if not form1_path.exists():
                    print(f"Form 1 data not found for year {year}: {form1_path}")
                if not form6_path.exists():
                    print(f"Form 6 data not found for year {year}: {form6_path}")
        if merged_data:
            # Stack all folders found for the year into one frame before saving.
            year_df = pd.concat(merged_data, axis=0)
            output_file = output_path / f"ICPSR_data_{year}.csv"
            year_df.to_csv(output_file, index=False)
            print(f"Saved merged data for year {year} to {output_file}")
        else:
            print(f"No merged data found for year {year}.")
load_and_save_icpsr_data(base_path, output_path, years_to_process)
#god bless Kerby Shedden
Step 2: Data Cleaning and Exploratory Data Analysis¶
import pandas as pd
from pathlib import Path
# Read the merged per-year CSVs and write lightly-cleaned copies.
base_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
# Fixed: was "//Users/..." — the doubled leading slash is non-canonical
# (POSIX leaves exactly two leading slashes implementation-defined) and was
# inconsistent with the single-slash path used to re-read this directory later.
output_path = Path("/Users/yipho/eecs398/portfolio/unprocessed_data")
output_path.mkdir(parents=True, exist_ok=True)
# Maps each logical variable name to the raw ICPSR column code used in each
# inclusive year range. Most codes are stable across 2000-2023; BR_SR_inhouse
# (brothers/sisters in household) changed codes starting in 2012.
variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    },
    "BR_SR_inhouse": {
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313",
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",
    },
}
def get_variable_for_year(variable_name, year, mapping=None):
    """Return the raw ICPSR column code for a logical variable in a given year.

    Parameters
    ----------
    variable_name : str
        Logical name, a key of ``mapping``.
    year : int
        Survey year to look up.
    mapping : dict, optional
        ``{logical_name: {(start, end): code}}`` table; defaults to the
        module-level ``variable_mapping``. (Added as a parameter so the
        lookup can be tested/reused without the global — backward compatible.)

    Returns
    -------
    str or None
        The column code whose inclusive (start, end) range covers ``year``,
        or None if no range matches.
    """
    if mapping is None:
        mapping = variable_mapping
    for (start, end), code in mapping[variable_name].items():
        if start <= year <= end:
            return code
    return None
def rename_variables(df, year):
    """Rename raw ICPSR column codes in ``df`` to logical names for ``year``.

    Builds a {raw_code: logical_name} map from the module-level
    ``variable_mapping`` (only for codes actually present in ``df``),
    returns a renamed copy, and logs what was renamed.
    """
    code_to_logical = {}
    for logical_name in variable_mapping:
        raw_code = get_variable_for_year(logical_name, year)
        if raw_code and raw_code in df.columns:
            code_to_logical[raw_code] = logical_name
    renamed = df.rename(columns=code_to_logical)
    print(f"Renamed columns for year {year}: {code_to_logical}")
    return renamed
def clean_and_process_data(df, year):
    """Rename, subset, and numerically decode one year of merged survey data.

    Steps: rename raw codes to logical names, keep only the columns of
    interest, decode Stata label strings like "Label: (3)" to integers,
    and drop rows missing any decoded variable. Returns a new DataFrame;
    ``df`` is not modified.

    Fixes vs. the original:
    - ``df[cols_interest]`` is now explicitly ``.copy()``-ed, so the decoded
      columns are written to an owned frame instead of a slice view
      (avoids pandas chained-assignment warnings / silent no-ops).
    - The nine sequential per-column ``dropna`` calls are collapsed into one
      equivalent ``dropna(subset=...)``.
    - Dead commented-out code removed.
    """
    df = rename_variables(df, year)
    cols_interest = [
        "RESPONDENT_ID",
        "V1_x",
        "SEX",
        "POL_BELIEFS",
        "NUM_SIBS",
        "BR_SR_inhouse",
        "FATHR_PRES",
        "MOTHR_PRES",
        "LONELY",
        "WISH_MORE_FRNDS",
        "USLLY_FRNDS",
    ]
    cols_interest = [col for col in cols_interest if col in df.columns]
    # Copy so the decode loop below writes to an owned frame, not a view.
    dfmain = df[cols_interest].copy()

    # Columns that carry coded categorical answers needing numeric decoding.
    cols_clean = ["NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES", "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS"]
    cols_clean = [col for col in cols_clean if col in df.columns]

    def extract_number(column):
        # Stata exports look like "Some label: (3)"; fall back to a bare
        # leading integer. Unparseable cells become None (then dropped).
        pattern1 = r".+:\s*\((-?\d+)\)"
        pattern2 = r"(\d+)"

        def parse_value(value):
            value_str = str(value).strip()
            m = re.match(pattern1, value_str)
            if m:
                return int(m.group(1))
            m = re.match(pattern2, value_str)
            if m:
                return int(m.group(1))
            return None

        return column.apply(parse_value)

    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])

    # Drop rows missing any decoded variable (equivalent to the original
    # chain of per-column dropna calls).
    if cols_clean:
        dfmain = dfmain.dropna(subset=cols_clean)
    if 'LONELY' in dfmain.columns:
        dfmain['LONELY'] = dfmain['LONELY'].astype(int)
    return dfmain
# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    """Clean every ICPSR_data_<year>.csv in ``base_path`` and write data_<year>.csv.

    A failure on one file is logged and does not stop processing the rest.
    """
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")
            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")
            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")
            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            print(f"Saved processed data for year {year} to {output_file}")
        except Exception as e:
            # Broad on purpose: one bad file should not abort the batch.
            print(f"Error processing file {csv_file}: {e}")
process_raw_data(base_path, output_path)
Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2013 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2013.csv
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2007 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2007.csv
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2006 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2006.csv
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2012 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2012.csv
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2004 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2004.csv
Renamed columns for year 2010: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2010 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2010.csv
Renamed columns for year 2011: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2011 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2011.csv
Renamed columns for year 2005: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2005 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2005.csv
Renamed columns for year 2001: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2001 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2001.csv
Renamed columns for year 2015: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2015 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2015.csv
Renamed columns for year 2014: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2014 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2014.csv
Renamed columns for year 2000: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2000 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2000.csv
Renamed columns for year 2016: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2016 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2016.csv
Renamed columns for year 2002: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2002 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2002.csv
Renamed columns for year 2003: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2003 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2003.csv
Renamed columns for year 2017: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2017 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2017.csv
Renamed columns for year 2019: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2019 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2019.csv
Renamed columns for year 2018: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2018 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2018.csv
Renamed columns for year 2008: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2008 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2008.csv
Renamed columns for year 2020: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2020 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2020.csv
Renamed columns for year 2021: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2021 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2021.csv
Renamed columns for year 2009: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2009 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2009.csv
Renamed columns for year 2023: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2023 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2023.csv
Renamed columns for year 2022: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2022 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2022.csv
# Second cleaning pass: re-read the merged CSVs and write fully processed
# (sentinel-filtered, rescaled) copies.
base_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
output_path = Path("/Users/yipho/eecs398/portfolio/processed_data")
output_path.mkdir(parents=True, exist_ok=True)
# Redefinition of the logical-name -> raw ICPSR code table (identical to the
# earlier one; year ranges are inclusive, BR_SR_inhouse changed codes in 2012).
variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    },
    "BR_SR_inhouse": {
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313",
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",
    },
}
def get_variable_for_year(variable_name, year, mapping=None):
    """Return the raw ICPSR column code for a logical variable in a given year.

    ``mapping`` defaults to the module-level ``variable_mapping``; passing it
    explicitly makes the lookup testable without the global (backward
    compatible). Returns None when no inclusive (start, end) range covers
    ``year``.
    """
    if mapping is None:
        mapping = variable_mapping
    for (start, end), code in mapping[variable_name].items():
        if start <= year <= end:
            return code
    return None
def rename_variables(df, year):
    """Rename raw ICPSR column codes in ``df`` to logical names for ``year``.

    Only codes actually present in ``df`` are renamed; the mapping used is
    logged. Returns a renamed copy of ``df``.
    """
    code_to_logical = {}
    for logical_name in variable_mapping:
        raw_code = get_variable_for_year(logical_name, year)
        if raw_code and raw_code in df.columns:
            code_to_logical[raw_code] = logical_name
    renamed = df.rename(columns=code_to_logical)
    print(f"Renamed columns for year {year}: {code_to_logical}")
    return renamed
def clean_and_process_data(df, year):
    """Rename, subset, decode, filter sentinels, and rescale one year of data.

    Beyond the first-pass cleaning, this version also removes sentinel /
    invalid codes per column (e.g. -9 = missing) and shifts the ordinal
    scales to start at 0. Returns a new DataFrame; ``df`` is not modified.

    Fixes vs. the original:
    - ``df[cols_interest]`` is explicitly ``.copy()``-ed so decoded values are
      written to an owned frame (avoids chained-assignment warnings).
    - ``isin(...) == False`` replaced with the idiomatic ``~isin(...)``.
    - The repeated filter/dropna chains are factored into one helper.
    - Dead commented-out code removed.
    """
    df = rename_variables(df, year)
    cols_interest = [
        "RESPONDENT_ID",
        "V1_x",
        "SEX",
        "POL_BELIEFS",
        "NUM_SIBS",
        "BR_SR_inhouse",
        "FATHR_PRES",
        "MOTHR_PRES",
        "LONELY",
        "WISH_MORE_FRNDS",
        "USLLY_FRNDS",
    ]
    cols_interest = [col for col in cols_interest if col in df.columns]
    # Copy so the decode loop below writes to an owned frame, not a view.
    dfmain = df[cols_interest].copy()

    cols_clean = ["NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES", "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS"]
    cols_clean = [col for col in cols_clean if col in df.columns]

    def extract_number(column):
        # Stata exports look like "Some label: (3)"; fall back to a bare
        # leading integer. Unparseable cells become None (then dropped).
        pattern1 = r".+:\s*\((-?\d+)\)"
        pattern2 = r"(\d+)"

        def parse_value(value):
            value_str = str(value).strip()
            m = re.match(pattern1, value_str)
            if m:
                return int(m.group(1))
            m = re.match(pattern2, value_str)
            if m:
                return int(m.group(1))
            return None

        return column.apply(parse_value)

    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])

    def drop_invalid(frame, col, bad_codes):
        # Remove sentinel codes (e.g. -9 = missing/refused) and rows whose
        # value could not be decoded.
        frame = frame[~frame[col].isin(bad_codes)]
        return frame.dropna(subset=[col])

    if "POL_BELIEFS" in dfmain.columns:
        dfmain = drop_invalid(dfmain, "POL_BELIEFS", [6, 8, -9])
        dfmain["POL_BELIEFS"] = dfmain["POL_BELIEFS"] - 1  # rescale to start from 0
    if "SEX" in dfmain.columns:
        dfmain = drop_invalid(dfmain, "SEX", [-9, 3, 4])
        dfmain["SEX"] = dfmain["SEX"] - 1  # 0 for male, 1 for female
    for col in ("NUM_SIBS", "BR_SR_inhouse", "FATHR_PRES", "MOTHR_PRES"):
        if col in dfmain.columns:
            dfmain = drop_invalid(dfmain, col, [-9])
    if "LONELY" in dfmain.columns:
        dfmain = drop_invalid(dfmain, "LONELY", [-9])
        dfmain["LONELY"] = dfmain["LONELY"].astype(int) - 1
    for col in ("WISH_MORE_FRNDS", "USLLY_FRNDS"):
        if col in dfmain.columns:
            dfmain = drop_invalid(dfmain, col, [-9])
            dfmain[col] = dfmain[col] - 1
    return dfmain
# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    """Fully clean every ICPSR_data_<year>.csv in ``base_path`` and write data_<year>.csv.

    Same driver as the first pass, but running the second (sentinel-filtering,
    rescaling) version of clean_and_process_data. Per-file failures are logged
    and skipped.
    """
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")
            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")
            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")
            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            # print(f"Saved processed data for year {year} to {output_file}")
        except Exception as e:
            # Broad on purpose: one bad file should not abort the batch.
            print(f"Error processing file {csv_file}: {e}")
process_raw_data(base_path, output_path)
Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2010: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2011: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2005: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2001: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2015: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2014: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2000: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2016: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2002: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2003: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2017: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2019: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2018: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2008: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2020: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2021: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2009: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2023: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2022: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
def avg_loneliness(df, year):
    """Mean loneliness, rescaled from 1-5 raw codes to [0, 1].

    Fixed: the original rewrote df['LONELY'] in place just to take a mean,
    mutating the caller's frame; this computes the scaled mean without
    side effects. ``year`` is unused but kept for interface compatibility.

    NOTE(review): files from processed_data/ have already been shifted to
    0-4, so the "- 1" here shifts them again (range -0.25..0.75) — confirm
    this helper is only meant for the 1-based unprocessed_data files.
    """
    return ((df['LONELY'] - 1) / 4).mean()
def avg_pol_beliefs(df, year):
    """Mean political leaning, rescaled from 1-5 raw codes to [0, 1].

    Fixed: computes the scaled mean without rewriting df['POL_BELIEFS'] in
    place (the original mutated the caller's frame). ``year`` is unused but
    kept for interface compatibility.

    NOTE(review): processed_data files are already 0-based, so the "- 1"
    double-shifts them — confirm intended input scale.
    """
    return ((df['POL_BELIEFS'] - 1) / 4).mean()
def avg_sibling_count(df, year):
    """Mean reported number of siblings (no rescaling). ``year`` is unused."""
    siblings = df["NUM_SIBS"]
    return siblings.mean()
def avg_wish_frnds(df, year):
    """Mean 'wish I had more friends' score, rescaled from 1-5 codes to [0, 1].

    Fixed: no longer mutates df['WISH_MORE_FRNDS'] in place. ``year`` is
    unused but kept for interface compatibility.
    """
    return ((df['WISH_MORE_FRNDS'] - 1) / 4).mean()
def avg_uslly_frnds(df, year):
    """Mean 'usually have a group of friends' score, rescaled from 1-5 to [0, 1].

    Fixed: no longer mutates df['USLLY_FRNDS'] in place. ``year`` is unused
    but kept for interface compatibility.
    """
    return ((df['USLLY_FRNDS'] - 1) / 4).mean()
def avg_fathr_pres(df, year):
    """Mean father-presence indicator (no rescaling). ``year`` is unused."""
    presence = df['FATHR_PRES']
    return presence.mean()
def avg_mothr_pres(df, year):
    """Mean mother-presence indicator (no rescaling). ``year`` is unused."""
    presence = df['MOTHR_PRES']
    return presence.mean()
#boolean to a numeric value for father and pres
# Build one summary row per year from the fully processed files.
base_path = Path("/Users/yipho/eecs398/portfolio/processed_data")
# Accumulators: one list per summary column.
data = {
    "Year": [],
    "Average Loneliness": [],
    "Average Political Beliefs": [],
    "Average Sibling Count": [],
    "Average Wish More Friends": [],
    "Average Usually Friends": [],
    "Average Father Presence": [],
    "Average Mother Presence": [],
}
for year in range(2000, 2024):
    path = base_path / f"data_{year}.csv"
    df = pd.read_csv(path)
    data["Year"].append(year)
    data["Average Loneliness"].append(avg_loneliness(df, year))
    data["Average Political Beliefs"].append(avg_pol_beliefs(df, year))
    data["Average Sibling Count"].append(avg_sibling_count(df, year))
    data["Average Wish More Friends"].append(avg_wish_frnds(df, year))
    data["Average Usually Friends"].append(avg_uslly_frnds(df, year))
    data["Average Father Presence"].append(avg_fathr_pres(df, year))
    data["Average Mother Presence"].append(avg_mothr_pres(df, year))
summary_df = pd.DataFrame(data)
summary_df
# Metrics to plot year-over-year (one figure each).
variables = [
    "Average Loneliness",
    "Average Political Beliefs",
    "Average Sibling Count",
    "Average Wish More Friends",
    "Average Usually Friends"
]
for var in variables:
    # NOTE(review): `go` (plotly.graph_objects) is not imported explicitly in
    # this file; presumably it arrives via `from lec_utils import *` — confirm.
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=summary_df["Year"],
        y=summary_df[var],
        mode='lines+markers',
        name=var
    ))
    # Update layout
    fig.update_layout(
        title=f"{var} Over Time (2000-2023)",
        xaxis_title="Year",
        yaxis_title=var,
        template="plotly_white"
    )
    fig.show("notebook")
# Switch to the lightly-cleaned files for the comparison run.
base_path = Path("/Users/yipho/eecs398/portfolio/unprocessed_data")


# drop -9 in lonely and pol_belief
def clean_data(df):
    """Drop rows whose LONELY or POL_BELIEFS carry sentinel codes -9 or 8.

    Fixed: replaced the ``isin(...) == False`` anti-idiom with ``~isin(...)``
    (same row selection). Returns a filtered view-copy; ``df`` is unchanged.
    """
    df = df[~df["LONELY"].isin([-9, 8])]
    df = df[~df["POL_BELIEFS"].isin([-9, 8])]
    return df
# Rebuild the yearly summary from the lightly-cleaned (unprocessed_data)
# files, dropping sentinel codes first via clean_data.
data = {
    "Year": [],
    "Average Loneliness": [],
    "Average Political Beliefs": [],
    "Average Sibling Count": [],
    "Average Wish More Friends": [],
    "Average Usually Friends": [],
    "Average Father Presence": [],
    "Average Mother Presence": [],
}
for year in range(2000, 2024):
    path = base_path / f"data_{year}.csv"
    df = pd.read_csv(path)
    df = clean_data(df)
    data["Year"].append(year)
    data["Average Loneliness"].append(avg_loneliness(df, year))
    data["Average Political Beliefs"].append(avg_pol_beliefs(df, year))
    data["Average Sibling Count"].append(avg_sibling_count(df, year))
    data["Average Wish More Friends"].append(avg_wish_frnds(df, year))
    data["Average Usually Friends"].append(avg_uslly_frnds(df, year))
    data["Average Father Presence"].append(avg_fathr_pres(df, year))
    data["Average Mother Presence"].append(avg_mothr_pres(df, year))
summary_df = pd.DataFrame(data)
# Overlay political beliefs and loneliness on one [0, 1]-scaled time axis.
# NOTE(review): `go` is presumably provided by `from lec_utils import *` — confirm.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=summary_df["Year"],
    y=summary_df["Average Political Beliefs"],
    mode='lines+markers',
    name="Average Political Beliefs"
))
fig.add_trace(go.Scatter(
    x=summary_df["Year"],
    y=summary_df["Average Loneliness"],
    mode='lines+markers',
    name="Average Loneliness"
))
fig.update_layout(
    title="Trends in Average Political Beliefs and Loneliness Over Time (2000-2023)",
    xaxis_title="Year",
    yaxis_title="Scaled Value",
    template="plotly_white",
    legend_title="Variables",
    yaxis=dict(range=[0, 1]) # Set y-axis range from 0 to 1
)
fig.show("notebook")
# Cross-tabulate loneliness vs political beliefs for the 2023 cohort.
df23 = pd.read_csv("processed_data/data_2023.csv")
pivot_table = pd.crosstab(df23["LONELY"], df23["POL_BELIEFS"])
pivot_table
| POL_BELIEFS | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| LONELY | |||||
| 0 | 16 | 35 | 32 | 9 | 4 |
| 1 | 7 | 28 | 53 | 40 | 17 |
| 2 | 17 | 34 | 46 | 45 | 17 |
| 3 | 11 | 34 | 39 | 46 | 18 |
| 4 | 9 | 21 | 33 | 36 | 9 |
SECOND PLOT, BIVAR ANALYSIS 1¶
# 2D histogram of political beliefs vs loneliness for 2023.
fig = px.density_heatmap(
    df23,
    x='POL_BELIEFS',
    y='LONELY',
    color_continuous_scale='Viridis',
    title="Bivariate Analysis of Political Beliefs and Loneliness (Heatmap Example)",
)
fig.show("notebook")
THIRD PLOT, BIVAR ANALYSIS 2¶
# Distribution of "consistent group of friends" responses split by sex (2023).
fig = px.box(df23, x = "SEX", y = "USLLY_FRNDS", title = "Boxplot", labels = {"SEX": "Sex", "USLLY_FRNDS": "Consistent Group of Friends?"})
fig.show("notebook")
Interesting Aggregates!¶
# Cross-tabulate loneliness vs wishing for more friends (2023 cohort).
pivot_table = pd.crosstab(df23['LONELY'], df23['WISH_MORE_FRNDS'])
pivot_table
| WISH_MORE_FRNDS | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| LONELY | |||||
| 0 | 48 | 18 | 12 | 10 | 8 |
| 1 | 37 | 45 | 14 | 32 | 17 |
| 2 | 23 | 29 | 48 | 39 | 20 |
| 3 | 21 | 28 | 15 | 50 | 34 |
| 4 | 12 | 11 | 7 | 25 | 53 |
Step 3: Framing a Prediction Problem¶
From the previous sections (the dual lineplot), it is clear that there is some correlation or convergence between how lonely people feel and their political leaning. Due to this, we want to explore any possible way to predict a student's political leaning using these indicators. This naturally leads us to explore classification algorithms and how we may use multiclass classification to identify a student's political disposition.
Formally, we are trying to use relevant variables that indicate the state of a responder's social network, based on a survey, to train a multiclass classification model. On the micro scale, the algorithm would allow us to predict the political leaning of a single respondent, but on the macro scale, we can observe the sentiment of the entire class as a whole. In order to see how the overall sentiment of 12th graders changes over time, we must focus on the macro scale. Thus, our prediction problem is as follows: Can we predict the overall political leaning of the class of 12th graders based on each individual's social network state?
Because the data is constructed in a way that prevents overlaps of conflicting categorical data points, we don't need to modify our cleaned data for the purposes of the baseline model. In line with best model-building practices, we will use an 80-20 split, with 80% for training and the remaining 20% for testing (matching the `test_size=0.20` used in the code below). Since we are using a multiclass classification model, we will evaluate performance using accuracy and the F1 score for simplicity and clarity.
Step 4: Baseline Model¶
# import all the necessary tools
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
Building the baseline Model¶
def getresbasic(year):
    """Train and evaluate a baseline random-forest classifier for one survey year.

    Loads the processed data for ``year``, predicts POL_BELIEFS from three
    social-network indicators, and prints accuracy and weighted F1 on a
    held-out 20% test split.

    Parameters
    ----------
    year : int
        Survey year whose CSV (processed_data/data_{year}.csv) is loaded.

    Returns
    -------
    None
        Metrics are printed only; nothing from this baseline model is reused.
    """
    # 1. Load the processed data for the requested year.
    try:
        dfpred = pd.read_csv(f"processed_data/data_{year}.csv")
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Narrowed from a bare `except:`, which would also have hidden
        # programming errors (NameError, KeyError) and KeyboardInterrupt.
        print(f"Error loading data for year {year}")
        return None
    # Target and the three baseline features.
    goal = dfpred["POL_BELIEFS"]
    Pred = dfpred[["BR_SR_inhouse", "LONELY", "WISH_MORE_FRNDS"]]
    # 2. Split the data (80% train / 20% test; fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        Pred, goal, random_state=100, test_size=0.20, shuffle=True)
    # 3. Train the model.
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    # 4. Predict on the held-out test set.
    y_pred = rf.predict(X_test)
    # 5. Evaluate the model.
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
    # Return nothing because we won't use anything from this baseline model.
    return None
Step 5: Final Model¶
# Build the Final Model
def getresfinal(year):
    """Train the tuned final model for one year and impute invalid beliefs.

    Loads the unprocessed data for ``year``, engineers features (a combined
    parents-present count and one-hot LONELY indicators), grid-searches a
    random forest, evaluates it on a held-out 20% test split, then predicts
    POL_BELIEFS for respondents whose recorded value was an invalid code
    (6, 8, or -9) and folds those predictions into the year's mean.

    Parameters
    ----------
    year : int
        Survey year whose CSV is loaded from the unprocessed_data directory.

    Returns
    -------
    tuple | None
        (mean political belief, year, accuracy, weighted F1), or None if
        the year's CSV could not be loaded.
    """
    path = f'/Users/yipho/eecs398/portfolio/unprocessed_data/data_{year}.csv'
    try:
        dfpred = pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Narrowed from a bare `except:` so real bugs are not swallowed.
        print(f"Error loading data for year {year}")
        return None
    # Split off rows with invalid POL_BELIEFS codes; the model trained on
    # valid rows will later impute beliefs for these respondents.
    dfprednew = dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == True]
    dfpred = dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == False]

    def make_parents(df):
        # Collapse the two parent-presence indicators into one count column.
        df["PARENTS_PRES"] = df["MOTHR_PRES"] + df["FATHR_PRES"]
        df = df.drop(columns=["MOTHR_PRES", "FATHR_PRES"])
        return df

    def onehot(df):
        # One-hot encode LONELY, guaranteeing all five indicator columns
        # exist even when a level is absent in this year's data.
        possible_values = [1, 2, 3, 4, 5]
        dummies = pd.get_dummies(df['LONELY'], prefix="LONELY")
        for value in possible_values:
            column_name = f"LONELY_{value}"
            if column_name not in dummies.columns:
                dummies[column_name] = False
        df = pd.concat([df, dummies], axis=1)
        return df

    # Single source of truth for the model's feature columns (used for both
    # the valid rows and the invalid rows imputed below).
    feature_cols = ["BR_SR_inhouse", "LONELY_1", "LONELY_2", "LONELY_3",
                    "LONELY_4", "LONELY_5", "WISH_MORE_FRNDS",
                    "USLLY_FRNDS", "NUM_SIBS", "PARENTS_PRES", "SEX"]
    dfpred = make_parents(dfpred)
    dfpred = onehot(dfpred)
    maingoal = dfpred["POL_BELIEFS"]
    Pred = dfpred[feature_cols]
    # 2. Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        Pred, maingoal, random_state=100, test_size=0.20, shuffle=True)
    # 3. Tune and train on the training split only.
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30, 40, 50]}
    rf = RandomForestClassifier(random_state=78)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    # GridSearchCV refits the best estimator on X_train (refit=True default).
    best_rf = grid_search.best_estimator_
    # BUG FIX: evaluate on the test set BEFORE refitting on the full data.
    # The original called best_rf.fit(Pred, maingoal) first, so X_test was
    # part of the training data and the reported metrics were inflated.
    y_pred = best_rf.predict(X_test)
    Accuracy = metrics.accuracy_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred, average='weighted')
    print("Accuracy:", Accuracy)
    print("F1 Score:", F1)
    # Now refit on all valid rows so imputation uses every labelled example.
    best_rf.fit(Pred, maingoal)
    # Predict beliefs for the invalid-code rows and append them to the
    # observed values before averaging.
    dfprednew = make_parents(dfprednew)
    dfprednew = onehot(dfprednew)
    Pred = dfprednew[feature_cols]
    y_pred = best_rf.predict(Pred)
    Total = pd.concat([maingoal, pd.Series(y_pred)], axis=0)
    return Total.mean(), year, Accuracy, F1
from sklearn.multioutput import MultiOutputClassifier  # NOTE(review): appears unused in this file — confirm before removing

# Run the final model for every survey year and collect per-year results.
meanarr = []
yeararr = []
Accarr = []
F1arr = []
for year in range(2000, 2024):
    try:
        result = getresfinal(year)
        if result is None:
            # BUG FIX: getresfinal returns None when a year's CSV is missing;
            # the original unpacked it directly, raising a TypeError that the
            # KeyError handler below did not catch and crashing the loop.
            continue
        mean, year, Acc, F1 = result
        meanarr.append(mean)
        yeararr.append(year)
        Accarr.append(Acc)
        F1arr.append(F1)
    except KeyError as e:
        # A year whose CSV lacks an expected column is skipped, not fatal.
        print(f"KeyError for year {year}: {e}")
        continue
meanarr = np.array(meanarr)
yeararr = np.array(yeararr)
Accarr = np.array(Accarr)
F1arr = np.array(F1arr)
FinalRes = pd.DataFrame({"Year": yeararr, "Mean": meanarr, "Accuracy": Accarr, "F1": F1arr})
Accuracy: 0.7718446601941747 F1 Score: 0.7506777424960853 Accuracy: 0.7894736842105263 F1 Score: 0.783077357747177 Accuracy: 0.7916666666666666 F1 Score: 0.7842150422475913 Accuracy: 0.7093023255813954 F1 Score: 0.6903324576730157 Accuracy: 0.7316176470588235 F1 Score: 0.7159800786023385 Accuracy: 0.6915254237288135 F1 Score: 0.6834058311804885 Accuracy: 0.7269372693726938 F1 Score: 0.7208770250081299 Accuracy: 0.6635220125786163 F1 Score: 0.644642247097599 Accuracy: 0.760797342192691 F1 Score: 0.7489117527052044 Accuracy: 0.6923076923076923 F1 Score: 0.6786650420458517 Accuracy: 0.7362637362637363 F1 Score: 0.7292266466363624 Accuracy: 0.8154761904761905 F1 Score: 0.8135929980466247 Accuracy: 0.75 F1 Score: 0.7396549453851684 Accuracy: 0.7177914110429447 F1 Score: 0.708349825186358 Accuracy: 0.7142857142857143 F1 Score: 0.7167513144341543 Accuracy: 0.7243589743589743 F1 Score: 0.7251534199990741 Accuracy: 0.7388535031847133 F1 Score: 0.7366251003298095 Accuracy: 0.7341040462427746 F1 Score: 0.723464871537565 Accuracy: 0.7388888888888889 F1 Score: 0.7350285361208926 Accuracy: 0.7771084337349398 F1 Score: 0.7767850983164013 Accuracy: 0.8571428571428571 F1 Score: 0.8557182613305061 Accuracy: 0.7368421052631579 F1 Score: 0.7336869478810817 Accuracy: 0.8012422360248447 F1 Score: 0.7985109614522304 Accuracy: 0.7902097902097902 F1 Score: 0.7876139688329625
# Alias the per-year results table; the bare expressions below are
# notebook display statements that render the DataFrame as output.
reFinRes = FinalRes
reFinRes
# Wrap in pd.DataFrame for the table used by the plots that follow.
reFineRes1 = pd.DataFrame(reFinRes)
reFineRes1
| Year | Mean | Accuracy | F1 | |
|---|---|---|---|---|
| 0 | 2000 | 3.04 | 0.77 | 0.75 |
| 1 | 2001 | 3.03 | 0.79 | 0.78 |
| 2 | 2002 | 3.12 | 0.79 | 0.78 |
| ... | ... | ... | ... | ... |
| 21 | 2021 | 2.99 | 0.74 | 0.73 |
| 22 | 2022 | 3.25 | 0.80 | 0.80 |
| 23 | 2023 | 3.04 | 0.79 | 0.79 |
24 rows × 4 columns
# Rescale the mean political belief from the survey's 1-5 coding onto a
# 0-1 axis so the tick labels below (Conservative/Moderate/Liberal) apply.
reFineRes1['Mean'] = (reFineRes1['Mean'] - 1) / 4
axis_labels = {"Mean": "Mean of Political Beliefs", "Year": "Year"}
fig = px.line(
    reFineRes1.reset_index(),
    x="Year",
    y="Mean",
    title="Mean of Political Beliefs Over Time",
    labels=axis_labels,
)
# Pin the y-axis to [0, 1] and replace numeric ticks with leaning labels.
fig.update_yaxes(range=[0, 1])
fig.update_yaxes(
    tickvals=[0, 0.5, 1],
    ticktext=["Conservative", "Moderate", "Liberal"],
)
fig.show("notebook")
# Overlay the survey's actual mean political belief with the model's
# predicted mean, year by year, on one figure.
fig = go.Figure()
actual_trace = go.Scatter(
    x=summary_df["Year"],
    y=summary_df["Average Political Beliefs"],
    mode='lines+markers',
    name="Actual Mean of Political Beliefs",
)
predicted_trace = go.Scatter(
    x=reFineRes1["Year"],
    y=reFineRes1["Mean"],
    mode='lines+markers',
    name="Predicted Mean of Political Beliefs",
)
fig.add_trace(actual_trace)
fig.add_trace(predicted_trace)
fig.update_layout(
    title="Actual vs. Predicted Mean of Political Beliefs Over Time",
    xaxis_title="Year",
    yaxis_title="Mean of Political Beliefs",
    yaxis=dict(range=[0, 1]),
    template="plotly_white",
    legend_title="Metrics",
)
fig.show("notebook")